import requests
from bs4 import BeautifulSoup
import re
from pydash import py_
import threading
import csv
import jieba

# Base URL of the openGauss site; scraped hrefs are relative and get prefixed with this.
OPENGAUES_URL_HEAD = 'https://opengauss.org'
SPECICAL_CONTENT = '本特性自openGauss' # some pages contain nothing except this one sentence and need special handling
# Root page of the 3.0.0 zh docs tree; all other doc links are discovered from it.
ROOT_PATH = '/zh/docs/3.0.0/docs/BriefTutorial/%E4%BB%80%E4%B9%88%E6%98%AFopenGauss.html'

# Rows collected by the worker threads: [title, content, link, title, tags].
datasets = []

def stopwordslist():
    """Load the Chinese stopword list from ./stopwords_zh.txt.

    Returns:
        list[str]: one stopword per line, stripped of surrounding whitespace.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original left the handle open until garbage collection).
    with open('./stopwords_zh.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]

# Build the stopword list once at import time; shared read-only by all
# worker threads via tags()/seg_depart().
STOPWORDS = stopwordslist()

# The articles carry no tag metadata, so tags are derived from the title:
# segment it with jieba, drop stopwords, and join the surviving tokens.
def tags(stopwords, line):
    """Build a fullwidth-bar ('｜') separated tag string from *line*."""
    segmented = seg_depart(stopwords, line)
    return '｜'.join(segmented.split())

def seg_depart(stopwords, sentence):
    """Segment *sentence* with jieba and drop stopwords and tab tokens.

    Returns the kept tokens each followed by a single space (so a non-empty
    result carries a trailing space, matching the original accumulator),
    or '' when nothing survives the filter.
    """
    tokens = jieba.cut(sentence.strip())
    kept = (tok for tok in tokens if tok not in stopwords and tok != '\t')
    return ''.join(tok + ' ' for tok in kept)

# Thread worker: scrape every page in *links* and append one dataset row per page.
def handler(links):
    """Fetch each doc page, extract title + first paragraph, collect a row.

    Each successful page appends [title, content, link, title, tags] to the
    module-level ``datasets`` list (list.append is atomic under the GIL).

    Fixes vs. the original:
    - a failed request or unparseable page now skips that link (``continue``)
      instead of aborting the whole chunk (``return``);
    - the bare ``except`` is narrowed to ``requests.RequestException``;
    - the no-<h1> branch no longer crashes when the page has fewer than
      two <p> tags, and missing <article> is handled.
    """
    print('** [Starting] ** ')
    for index, link_tag in enumerate(links, start=1):
        print('  >> Progress: ' + str(index) + '/' + str(len(links)))
        print(link_tag.get('href'))
        link = OPENGAUES_URL_HEAD + link_tag.get('href')

        try:
            doc_req = requests.get(url=link)
        except requests.RequestException:
            # Some links are dead; skip this one and keep going.
            print('******** Request Error ********')
            continue
        doc_req.encoding = 'utf-8'
        doc_bs = BeautifulSoup(doc_req.text, "html.parser")
        markdowns = doc_bs.select('article')
        if not markdowns:
            # No <article> on the page at all; nothing to extract.
            print('******** Can not find article ********')
            continue
        markdown = markdowns[0]
        h1 = markdown.find('h1')
        content_objs = []
        content_obj = None
        if not h1:
            # Special near-empty pages without an <h1>: first <p> is the
            # title, second <p> is the content.
            special_paragraphs = markdown.find_all('p')
            if special_paragraphs and len(special_paragraphs) > 1:
                h1 = special_paragraphs[0]
                content_obj = special_paragraphs[1]
            if h1 is None or content_obj is None:
                # Original crashed here with NameError/AttributeError.
                print('******** Can not find title/content ********')
                continue
        else:
            # Normal pages: <h1> is the title, first <p> is the content.
            content_objs = markdown.find_all('p')
            if content_objs:
                content_obj = content_objs[0]
            else:
                print('******** Can not find content ******** ' + str(len(content_objs)))
                continue
        title = h1.contents[0]
        content = content_obj.getText()
        # Pages whose first paragraph is only the boilerplate sentence fall
        # back to the next paragraph when one exists (len check also guards
        # the original's potential IndexError at content_objs[1]).
        if SPECICAL_CONTENT in content and len(content_objs) > 1:
            content = content_objs[1].getText()
        print(title)
        print(content)
        datasets.append([title, content, link, title, tags(STOPWORDS, title)])

def mul_thread_handler(links):
    """Start a worker thread running handler() over one chunk of links.

    Returns the started Thread so the caller can join it later.
    """
    worker = threading.Thread(target=handler, args=(links,))
    worker.start()
    return worker

# Fetch the root tutorial page; every other doc link is discovered from it.
url = [OPENGAUES_URL_HEAD + ROOT_PATH]
req = requests.get(url=url[0])
req.encoding = 'utf-8'
html = req.text
bs = BeautifulSoup(html, "html.parser")# html.parser is the (stdlib) HTML parser

# Predicate for BeautifulSoup's find_all(href=...): keep hrefs that point
# into the zh docs tree, skipping gitee.com links.
def with_docs(href):
    """Return True when *href* is a zh-docs link worth scraping.

    The original recompiled two regexes on every call; plain substring
    membership is equivalent for these literal patterns and cheaper.
    find_all() only uses truthiness, so returning bool is compatible.
    """
    return bool(href) and 'zh/docs' in href and 'gitee.com' not in href

# Every doc link on the root page that passes the with_docs filter.
all_links = bs.find_all(href=with_docs)

# Note: chunk_size is NOT the thread count — the link list is split into
# chunks of chunk_size, and thread count = len(all_links) / chunk_size.
chunk_size = 5
chunk = py_.chunk(all_links, chunk_size)

# One worker thread per chunk; keep the Thread objects so we can join below.
threads = py_(chunk).map(mul_thread_handler).value()

def join_all(thread):
    """Block until *thread* terminates (map callback for the join loop)."""
    return thread.join()

# Wait for every worker thread to finish before writing the CSV.
py_(threads).map(join_all).value()

print('***************************** At last *****************************')

# Write collected rows, appending directly to ../data/data.csv.
# Column order: title, content, link, process, tag. The header row stays
# commented out because the file is opened in append mode.
# headers = ('title', 'content', 'link', 'process', 'tag')
with open('../data/data.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # writer.writerow(headers)
    # writerows replaces the original index-based loop over range(len(...)).
    writer.writerows(datasets)